{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T00:26:54.238312Z",
     "start_time": "2019-09-27T00:26:52.234675Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "#! -*- coding:utf-8 -*-\n",
    "import re, os, json, codecs, gc\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from random import choice\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "from keras_bert import load_trained_model_from_checkpoint, Tokenizer\n",
    "\n",
    "from keras.layers import *\n",
    "from keras.callbacks import *\n",
    "from keras.models import Model\n",
    "import keras.backend as K\n",
    "from keras.optimizers import Adam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T00:26:55.405550Z",
     "start_time": "2019-09-27T00:26:55.199556Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r\n",
      "\r\n",
      "\r\n",
      "\r\n",
      "\r\n",
      "\r\n",
      "\r\n",
      "\r\n",
      "\r\n",
      "\r\n"
     ]
    }
   ],
   "source": [
    "!head ../input/task1/train.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T00:26:56.936586Z",
     "start_time": "2019-09-27T00:26:56.467897Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../input/task1/train.csv')\n",
    "test_df = pd.read_csv('../input/task1/test_stage1.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T00:26:57.268197Z",
     "start_time": "2019-09-27T00:26:57.189701Z"
    }
   },
   "outputs": [],
   "source": [
    "#! -*- coding:utf-8 -*-\n",
    "import re, os, json, codecs, gc\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from random import choice\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "from keras_bert import load_trained_model_from_checkpoint, Tokenizer\n",
    "\n",
    "from keras.layers import *\n",
    "from keras.callbacks import *\n",
    "from keras.models import Model\n",
    "import keras.backend as K\n",
    "from keras.optimizers import Adam\n",
    "\n",
    "maxlen = 128\n",
    "config_path = '/home/lyz/work/kaggle/bert_workspace/chinese_L-12_H-768_A-12/bert_config.json'\n",
    "# checkpoint_path = '/export/home/liuyuzhong/kaggle/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'\n",
    "checkpoint_path = '/home/lyz/work/kaggle/bert_workspace/chinese_L-12_H-768_A-12/bert_model.ckpt'\n",
    "dict_path = '/home/lyz/work/kaggle/bert_workspace/chinese_L-12_H-768_A-12/vocab.txt'\n",
    "\n",
    "token_dict = {}\n",
    "with codecs.open(dict_path, 'r', 'utf8') as reader:\n",
    "    for line in reader:\n",
    "        token = line.strip()\n",
    "        token_dict[token] = len(token_dict)\n",
    "\n",
    "class OurTokenizer(Tokenizer):\n",
    "    def _tokenize(self, text):\n",
    "        R = []\n",
    "        for c in text:\n",
    "            if c in self._token_dict:\n",
    "                R.append(c)\n",
    "            elif self._is_space(c):\n",
    "                R.append('[unused1]') # space类用未经训练的[unused1]表示\n",
    "            else:\n",
    "                R.append('[UNK]') # 剩余的字符是[UNK]\n",
    "        return R\n",
    "\n",
    "tokenizer = OurTokenizer(token_dict)\n",
    "\n",
    "def seq_padding(X, padding=0):\n",
    "    L = [len(x) for x in X]\n",
    "    ML = max(L)\n",
    "    return np.array([\n",
    "        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X\n",
    "    ])\n",
    "\n",
    "class data_generator:\n",
    "    def __init__(self, data, batch_size=8, shuffle=True):\n",
    "        self.data = data\n",
    "        self.batch_size = batch_size\n",
    "        self.shuffle = shuffle\n",
    "        self.steps = len(self.data) // self.batch_size\n",
    "        if len(self.data) % self.batch_size != 0:\n",
    "            self.steps += 1\n",
    "    def __len__(self):\n",
    "        return self.steps\n",
    "    def __iter__(self):\n",
    "        while True:\n",
    "            idxs = list(range(len(self.data)))\n",
    "            \n",
    "            if self.shuffle:\n",
    "                np.random.shuffle(idxs)\n",
    "            \n",
    "            X1, X2, Y = [], [], []\n",
    "            for i in idxs:\n",
    "                d = self.data[i]\n",
    "                text = d[0][:maxlen]\n",
    "                x1, x2 = tokenizer.encode(first=text)\n",
    "                y = d[1]\n",
    "                X1.append(x1)\n",
    "                X2.append(x2)\n",
    "                Y.append([y])\n",
    "                if len(X1) == self.batch_size or i == idxs[-1]:\n",
    "                    X1 = seq_padding(X1)\n",
    "                    X2 = seq_padding(X2)\n",
    "                    Y = seq_padding(Y)\n",
    "                    yield [X1, X2], Y[:, 0, :]\n",
    "                    [X1, X2, Y] = [], [], []\n",
    "\n",
    "from keras.metrics import top_k_categorical_accuracy\n",
    "def acc_top2(y_true, y_pred):\n",
    "    return top_k_categorical_accuracy(y_true, y_pred, k=2)\n",
    "                    \n",
    "def build_bert(nclass):\n",
    "    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)\n",
    "\n",
    "    for l in bert_model.layers:\n",
    "        l.trainable = True\n",
    "\n",
    "    x1_in = Input(shape=(None,))\n",
    "    x2_in = Input(shape=(None,))\n",
    "\n",
    "    x = bert_model([x1_in, x2_in])\n",
    "    x = Lambda(lambda x: x[:, 0])(x)\n",
    "    p = Dense(nclass, activation='softmax')(x)\n",
    "\n",
    "    model = Model([x1_in, x2_in], p)\n",
    "    model.compile(loss='categorical_crossentropy', \n",
    "                  optimizer=Adam(1e-5),\n",
    "                  metrics=['accuracy', acc_top2])\n",
    "    print(model.summary())\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T00:26:58.807094Z",
     "start_time": "2019-09-27T00:26:58.487181Z"
    }
   },
   "outputs": [],
   "source": [
    "from keras.utils import to_categorical\n",
    "\n",
    "DATA_LIST = []\n",
    "for data_row in train_df.iloc[:].itertuples():\n",
    "    DATA_LIST.append((data_row.text, to_categorical(data_row.label, 2)))\n",
    "DATA_LIST = np.array(DATA_LIST, dtype=object)\n",
    "\n",
    "DATA_LIST_TEST = []\n",
    "for data_row in test_df.iloc[:].itertuples():\n",
    "    DATA_LIST_TEST.append((data_row.text, to_categorical(0, 2)))\n",
    "DATA_LIST_TEST = np.array(DATA_LIST_TEST,  dtype=object)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T00:26:59.710225Z",
     "start_time": "2019-09-27T00:26:59.589719Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: 无法创建目录\"bert_dump\": 文件已存在\r\n"
     ]
    }
   ],
   "source": [
    "!mkdir bert_dump"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T00:27:00.419929Z",
     "start_time": "2019-09-27T00:27:00.407640Z"
    }
   },
   "outputs": [],
   "source": [
    "def run_cv(nfold, data, data_label, data_test):\n",
    "    kf = KFold(n_splits=nfold, shuffle=True, random_state=520).split(data)\n",
    "    train_model_pred = np.zeros((len(data), 2))\n",
    "    test_model_pred = np.zeros((len(data_test), 2))\n",
    "\n",
    "    for i, (train_fold, test_fold) in enumerate(kf):\n",
    "        X_train, X_valid, = data[train_fold, :], data[test_fold, :]\n",
    "        \n",
    "        model = build_bert(2)\n",
    "        early_stopping = EarlyStopping(monitor='val_acc', patience=3)\n",
    "        plateau = ReduceLROnPlateau(monitor=\"val_acc\", verbose=1, mode='max', factor=0.5, patience=2)\n",
    "        checkpoint = ModelCheckpoint('./bert_dump/' + str(i) + '.hdf5', monitor='val_acc', \n",
    "                                         verbose=2, save_best_only=True, mode='max',save_weights_only=True)\n",
    "        \n",
    "        train_D = data_generator(X_train, shuffle=True)\n",
    "        valid_D = data_generator(X_valid, shuffle=True)\n",
    "        test_D = data_generator(data_test, shuffle=False)\n",
    "        \n",
    "        model.fit_generator(\n",
    "            train_D.__iter__(),\n",
    "            steps_per_epoch=len(train_D),\n",
    "            epochs=5,\n",
    "            validation_data=valid_D.__iter__(),\n",
    "            validation_steps=len(valid_D),\n",
    "            callbacks=[early_stopping, plateau, checkpoint],\n",
    "        )\n",
    "        \n",
    "        # model.load_weights('./bert_dump/' + str(i) + '.hdf5')\n",
    "        \n",
    "        # return model\n",
    "        train_model_pred[test_fold, :] =  model.predict_generator(valid_D.__iter__(), steps=len(valid_D),verbose=1)\n",
    "        test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D),verbose=1)\n",
    "        \n",
    "        del model; gc.collect()\n",
    "        K.clear_session()\n",
    "        \n",
    "        # break\n",
    "        \n",
    "    return train_model_pred, test_model_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2019-09-27T00:27:02.107Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Colocations handled automatically by placer.\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n",
      "__________________________________________________________________________________________________\n",
      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
      "==================================================================================================\n",
      "input_1 (InputLayer)            (None, None)         0                                            \n",
      "__________________________________________________________________________________________________\n",
      "input_2 (InputLayer)            (None, None)         0                                            \n",
      "__________________________________________________________________________________________________\n",
      "model_2 (Model)                 (None, None, 768)    101677056   input_1[0][0]                    \n",
      "                                                                 input_2[0][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "lambda_1 (Lambda)               (None, 768)          0           model_2[1][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "dense_1 (Dense)                 (None, 2)            1538        lambda_1[0][0]                   \n",
      "==================================================================================================\n",
      "Total params: 101,678,594\n",
      "Trainable params: 101,678,594\n",
      "Non-trainable params: 0\n",
      "__________________________________________________________________________________________________\n",
      "None\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use tf.cast instead.\n",
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:102: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Deprecated in favor of operator or tf.math.divide.\n",
      "Epoch 1/5\n",
      "4328/4328 [==============================] - 769s 178ms/step - loss: 0.0247 - acc: 0.9909 - acc_top2: 1.0000 - val_loss: 0.0459 - val_acc: 0.9849 - val_acc_top2: 1.0000\n",
      "\n",
      "Epoch 00002: val_acc improved from 0.98363 to 0.98493, saving model to ./bert_dump/0.hdf5\n",
      "Epoch 3/5\n",
      "2935/4328 [===================>..........] - ETA: 4:00 - loss: 0.0079 - acc: 0.9976 - acc_top2: 1.0000"
     ]
    }
   ],
   "source": [
    "train_model_pred, test_model_pred = run_cv(10, DATA_LIST, None, DATA_LIST_TEST)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T01:42:19.650947Z",
     "start_time": "2019-09-27T01:42:19.645201Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4000,)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df['label'] = np.argmax(test_model_pred, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-27T01:43:54.929837Z",
     "start_time": "2019-09-27T01:43:54.914841Z"
    }
   },
   "outputs": [],
   "source": [
    "test_df[['id', 'label']].to_csv('task2_bert_fold1.csv', index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
